#Run this to generate mean imputed data


source("source_mean_imputation.R")




# Start data analysis --------------------------------------------------------------
rm(list = ls())
gc()

library(lubridate)
library(tidyverse)
library(foreign)
library(ggplot2)
library(ggthemes)
library(estimatr)
library(PRROC)
library(pROC)
library(glmnet)

df = readRDS("data/baseline_clean_mean.rds")

covars = c("syr_origin", "syr_origin_urban", "location_its", "sick", "head_educ2", "toddler", "elderly", "female_headed_hh", "location_district", "hezb_control")
covars2 = c("syr_origin_urban", "location_its", "sick", "head_educ2", "toddler", "elderly", "female_headed_hh", "hezb_control")

df[,covars] %>% summary
df[,covars2] %>% summary

source("functions/effect_plotter2.R")



# Define inputs -----------------------------------------------------------
leb_inputs = c("head_work_legal", "head_work_days_4", "head_work_hours_4", "head_income",
               "hh_inc_source_aid", "aid_atm", "aid_wfp", "hh_leb_2011", "hh_inc_stable", 
               "aid_chg", "own_items_1", "own_items_2", "own_items_3", "own_items_4", "own_items_5",
               "own_items_6", "own_items_7", "own_items_8", "own_items_10", "own_items_11",
               "expenses_ability", "assets_value", "assets_months", "hh_income",
               #
               "ipl_connected", "ipl_outsider", "ipl_arabic", "ipl_literate", "ipl_politics", "ipl_discuss", 
               "ipl_job", "length_stay", "curfews_never", 
               "no_curfews_now", "ind_public_treat", "ind_auth_treat", "not_detained", "no_disc_housing", 
               "ind_mobility", "ind_mobility_hh",
               #
               "head_not_sick", "head_treated", "person_nosick", "person_treat", 
               "ipl_doctor", "disc_healthcare", "no_need_school", 
               "stability_towns", "stability_current", "scl_not_preventive", "access_legal", "own_items_9",
               #
               "registered", "resident",
               #
               "hh_syr_leb", "ipl_lebanese", "ipl_syrian", "leban_relatives", "ipl_meal")

df[,leb_inputs] %>% summary

syr_inputs = c("safety_current", "oppsn_sympathy", "protests", "exposure_violence",
               "safe_future", "conscription",
               #
               "control_now_regime", "control_now_oppsn", "control_now_russia", "control_now_turkey",
               "control_now_kurds", "contested_now", "control_past_oppsn", "control_past_russia", 
               "control_past_regime", "control_past_kurds", "control_past_turkey", "contested_past", "isis_control",
               #
               "jobs_origin", "debt_fin", "syr_property_1", "syr_property_2", "syr_property_3", 
               "syr_land_future", "syr_home_doc",
               #
               "services_syr_elect", "services_syr_water", "services_syr_scl", 
               "services_syr_health", "services_syr_improve",
               #
               "syr_stay", "relatives_rtrn1", "relatives_rtrn2", "hh_rtrn")

df[,syr_inputs] %>% summary

leb_indices = c('index_econ_leb', 'index_soc_leb', 'index_services_leb',
                'index_legal_leb', 'index_networks_lebanon')

df[,leb_indices] %>% summary

syr_indices = c('index_safety_syria', 'index_regime_control', 'index_econ_syria',
                'index_services_syria', 'index_networks_syria')

df[,syr_indices] %>% summary

set.seed(04038)

# Define empty vectors for saving cross-validation output
Y_CV <- c()

syr_preds_OLS <- c()
leb_preds_OLS <- c()
full_preds_OLS <- c()

syr_preds_lasso <- c()
leb_preds_lasso <- c()
full_preds_lasso <- c()


lambda_seq <- 10^seq(2, -4, by = -.05)
i = 1

for(i in 1:10){
  df_drop1 <- df[-sample(x = 1:nrow(df), size = 1),] # drop one random row to get n=3002, to achieve even test/train split
  syr_vars <- as.matrix(df_drop1[, c(syr_indices, covars2)])
  leb_vars <- as.matrix(df_drop1[, c(leb_indices, covars2)])
  full_vars <- as.matrix(df_drop1[, c(syr_indices, leb_indices, covars2)])
  y_var <- df_drop1$rtrn_head_12
  
# Splitting the data into test and train
  train = sort(sample(1:nrow(df_drop1), nrow(df_drop1)/2))
  x_test = (-train)


  

# Run OLS Models ----------------------------------------------------------

# Fit models
# Fit OLS
  pull_12m <- lm_robust(as.formula(paste("rtrn_head_12", 
                                         paste(c(syr_indices, covars2), collapse = " + "), sep = " ~ ")), 
                        clusters = location_town, weights = weights, data = df_drop1[train, ])
    
  push_12m <- lm_robust(as.formula(paste("rtrn_head_12", 
                                         paste(c(leb_indices, covars2), collapse = " + "), sep = " ~ ")), 
                        clusters = location_town, weights = weights, data = df_drop1[train, ])
  
  full_12m <- lm_robust(as.formula(paste("rtrn_head_12", 
                                         paste(c(leb_indices, syr_indices, covars2), 
                                               collapse = " + "), sep = " ~ ")), 
                        clusters = location_town, weights = weights, data = df_drop1[train, ])



# Predict
  # Syria
  pred_syr_OLS <- predict(object = pull_12m, newdata = as.data.frame(syr_vars[x_test,]))
  # pred_syr_logit <- predict(pull_12m_logit, newx = syr_vars[x_test,], type = "response")
  
  # Lebanon
  pred_leb_OLS <- predict(object = push_12m, newdata = as.data.frame(leb_vars[x_test,]))
  # pred_leb_logit <- predict(push_12m_logit, newx = syr_vars[x_test,], type = "response")
  
  # Full: Syria and Lebanon
  pred_full_OLS <- predict(object = full_12m, newdata = as.data.frame(full_vars[x_test,]))
  # pred_full_logit <- predict(full_12m_logit, newx = syr_vars[x_test,], type = "response")

# Save Output
  # Append CV-selected actual outcomes (Y)
  Y_CV <- c(Y_CV, y_var[x_test])
  
  # Append predictions (Y hat) OLS
  syr_preds_OLS <- c(syr_preds_OLS, pred_syr_OLS)
  leb_preds_OLS <- c(leb_preds_OLS, pred_leb_OLS)
  full_preds_OLS <- c(full_preds_OLS, pred_full_OLS)
  

# In loop: Run Lasso ------------------------------------------------------
  # Syria Lasso Execution ---------------------------------------------------
 
  cv_lasso_syr <- cv.glmnet(syr_vars[train,], y_var[train],
                             alpha = 1, lambda = lambda_seq, 
                             nfolds = 10, weights = df_drop1[train, 'weights'])
  
  # identifying best lamda
  best_lam_syr <- cv_lasso_syr$lambda.min
  #best_lam_syr
  
  # Rebuilding the model with best lamda value identified
  lasso_best_syr <- glmnet(syr_vars[train,], y_var[train], alpha = 1, lambda = best_lam_syr)
  pred_syr <- predict(lasso_best_syr, s = best_lam_syr, newx = syr_vars[x_test,])
  
  final_syr <- cbind(y_var[x_test], pred_syr)
  # Checking the first six obs
  
  
  # Lebanon Lasso Execution -------------------------------------------------

  cv_lasso_leb <- cv.glmnet(leb_vars[train,], y_var[train],
                             alpha = 1, lambda = lambda_seq, 
                             nfolds = 10, weights = df_drop1[train, 'weights'])
  
  # identifying best lamda
  best_lam_leb <- cv_lasso_leb$lambda.min
  #best_lam_leb
  
  # Rebuilding the model with best lamda value identified
  lasso_best_leb <- glmnet(leb_vars[train,], y_var[train], alpha = 1, lambda = best_lam_leb)
  pred_leb <- predict(lasso_best_leb, s = best_lam_leb, newx = leb_vars[x_test,])
  
  final_leb <- cbind(y_var[x_test], pred_leb)
  # Checking the first six obs
 # head(final_leb)
  

  # Full Lasso Execution ----------------------------------------------------

  cv_lasso_full <- cv.glmnet(full_vars[train,], y_var[train],
                              alpha = 1, lambda = lambda_seq, 
                              nfolds = 10, weights = df_drop1[train, 'weights'])
  
  # identifying best lamda
  best_lam_full <- cv_lasso_full$lambda.min
  #best_lam_full
  
  # Rebuilding the model with best lamda value identified
  lasso_best_full <- glmnet(full_vars[train,], y_var[train], alpha = 1, lambda = best_lam_full)
  pred_full <- predict(lasso_best_full, s = best_lam_full, newx = full_vars[x_test,])
  
  final_full <- cbind(y_var[x_test], pred_full)
  # Checking the first six obs
  
  
  
  syr_preds_lasso <- c(syr_preds_lasso, final_syr[,2])
  leb_preds_lasso <- c(leb_preds_lasso, final_leb[,2])
  full_preds_lasso <- c(full_preds_lasso, final_full[,2])
  
  
}


# Calculate OLS PR AUCs -------------------------------------------------
# Pull 
fg_pull <- syr_preds_OLS[Y_CV == 1]
bg_pull <- syr_preds_OLS[Y_CV == 0]
pr_pull <- pr.curve(scores.class0 = fg_pull, scores.class1 = bg_pull, curve = T)
pr_pull


# Push
fg_push <- leb_preds_OLS[Y_CV == 1]
bg_push <- leb_preds_OLS[Y_CV == 0]
pr_push <- pr.curve(scores.class0 = fg_push, scores.class1 = bg_push, curve = T)
pr_push


# Full
fg_full <- full_preds_OLS[Y_CV == 1]
bg_full <- full_preds_OLS[Y_CV == 0]
pr_full <- pr.curve(scores.class0 = fg_full, scores.class1 = bg_full, curve = T)
pr_full


# ROC - OLS model
### Syria 
roc.data.pull_12 <-roc(response = Y_CV, predictor = syr_preds_OLS, ci=T)

### Lebanon
roc.data.push_12 <-roc(response = Y_CV, predictor = leb_preds_OLS, ci=T)


### Full
roc.data.full_12 <-roc(response = Y_CV, predictor = full_preds_OLS, ci=T)



# Define plotting data for OLS models -------------------------------------

# Code 3-in-1 PR curve plot and 3-in-1 ROC plot
pull_PR <- data.frame(coords(roc.data.pull_12, "all", ret = c("recall", "precision"), 
                  transpose = FALSE), sensitivities = roc.data.pull_12$sensitivities, specificities = roc.data.pull_12$specificities)
push_PR <- data.frame(coords(roc.data.push_12, "all", ret = c("recall", "precision"), 
                             transpose = FALSE), sensitivities = roc.data.push_12$sensitivities, specificities = roc.data.push_12$specificities)
full_PR <- data.frame(coords(roc.data.full_12, "all", ret = c("recall", "precision"), 
                             transpose = FALSE), sensitivities = roc.data.full_12$sensitivities, specificities = roc.data.full_12$specificities)












# Lasso ROC

# ROC - Based on lasso model

### 12 month Syria Model
roc.data.pull_12m_lasso <-roc(response = Y_CV, predictor = syr_preds_lasso, ci=T)




auc(roc.data.pull_12m_lasso)




### 12 month Lebanon Model
roc.data.push_12m_lasso <-roc(response = Y_CV, predictor = leb_preds_lasso, ci=T)




auc(roc.data.push_12m_lasso)




### 12 month Full Model
roc.data.full_12m_lasso <-roc(response = Y_CV, predictor = full_preds_lasso, ci=T)



auc(roc.data.full_12m_lasso)



# Calculate Lasso PR AUCs -------------------------------------------------

# Pull 
fg_pull_lasso <- syr_preds_lasso[Y_CV == 1]
bg_pull_lasso <- syr_preds_lasso[Y_CV == 0]
pr_pull_lasso <- pr.curve(scores.class0 = fg_pull_lasso, scores.class1 = bg_pull_lasso, curve = T)
pr_pull_lasso


# Push
fg_push_lasso <- leb_preds_lasso[Y_CV == 1]
bg_push_lasso <- leb_preds_lasso[Y_CV == 0]
pr_push_lasso <- pr.curve(scores.class0 = fg_push_lasso, scores.class1 = bg_push_lasso, curve = T)
pr_push_lasso


# Full
fg_full_lasso <- full_preds_lasso[Y_CV == 1]
bg_full_lasso <- full_preds_lasso[Y_CV == 0]
pr_full_lasso <- pr.curve(scores.class0 = fg_full_lasso, scores.class1 = bg_full_lasso, curve = T)
pr_full_lasso







# Define plotting data for Lasso models -----------------------------------

pull_PR_lasso <- data.frame(coords(roc.data.pull_12m_lasso, "all", ret = c("recall", "precision"), 
                             transpose = FALSE), 
                            sensitivities = roc.data.pull_12m_lasso$sensitivities, specificities = roc.data.pull_12m_lasso$specificities)
push_PR_lasso <- data.frame(coords(roc.data.push_12m_lasso, "all", ret = c("recall", "precision"), 
                             transpose = FALSE), 
                            sensitivities = roc.data.push_12m_lasso$sensitivities, specificities = roc.data.push_12m_lasso$specificities)
full_PR_lasso <- data.frame(coords(roc.data.full_12m_lasso, "all", ret = c("recall", "precision"), 
                             transpose = FALSE), 
                            sensitivities = roc.data.full_12m_lasso$sensitivities, specificities = roc.data.full_12m_lasso$specificities)



# Report AUCs -------------------------------------------------------------


# OLS PR Curves AUCs
#Pull PR AUC OLS
ols_pr_pull = round(pr_pull$auc.integral, 3)
#Push PR AUC OLS
ols_pr_push = round(pr_push$auc.integral, 3)
#Full PR AUC OLS
ols_pr_full = round(pr_full$auc.integral, 3)



# OLS ROC AUCs
# Pull ROC AUC OLS
ols_roc_pull = round(auc(roc.data.pull_12), 3)
# Push ROC AUC OLS
ols_roc_push = round(auc(roc.data.push_12), 3)
# Full ROC AUC OLS
ols_roc_full = round(auc(roc.data.full_12), 3)




# Lasso PR Curves AUCs
#Pull PR AUC Lasso
lasso_pr_pull = round(pr_pull_lasso$auc.integral, 3)
#Push PR AUC Lasso
lasso_pr_push = round(pr_push_lasso$auc.integral, 3)
#Full PR AUC Lasso
lasso_pr_full = round(pr_full_lasso$auc.integral, 3)


# Lasso ROC AUCs
# Pull ROC AUC Lasso
lasso_roc_pull = round(auc(roc.data.pull_12m_lasso), 3)
# Push ROC AUC Lasso
lasso_roc_push = round(auc(roc.data.push_12m_lasso), 3)
# Full ROC AUC Lasso
lasso_roc_full = round(auc(roc.data.full_12m_lasso), 3)




# Plots -------------------------------------------------------------------

# Plot: 3-in-1 ggplot PR curves for OLS models ----------------------------------
pdf("figures/appendix/ols_pr.pdf")
ggplot(pull_PR) + theme_minimal() + 
  coord_equal() + 
  geom_line(aes(x = recall, y = precision, colour = paste0("Pull AUC: ", ols_pr_pull)), pull_PR) +
  geom_line(aes(x = recall, y = precision, colour = paste0("Push AUC: ", ols_pr_push)), push_PR) +
  geom_line(aes(x = recall, y = precision, colour =paste0("Full AUC: ", ols_pr_full)), full_PR) +
  scale_colour_discrete(name="Model") +
  ggtitle("PR, OLS models") 
dev.off()



# Plot: 3-in-1 ROC curves for OLS models ----------------------------------
pdf("figures/appendix/ols_roc.pdf")
ggroc(roc.data.pull_12) + theme_minimal() + 
  aes(colour = paste0("Pull AUC: ", ols_roc_pull)) +
  geom_abline(slope=1, intercept = 1, linetype = "dashed", alpha=0.7, 
              color = "grey") + 
  coord_equal() + 
  geom_line(aes(x = specificities, y = sensitivities, colour = paste0("Push AUC: ", ols_roc_push)), push_PR) +
  geom_line(aes(x = specificities, y = sensitivities, colour = paste0("Full AUC: ", ols_roc_full)), full_PR) +
  scale_colour_discrete(name="Model") +
  ggtitle("ROC, OLS models") 
dev.off()


# Lasso plots -------------------------------------------------------------


# Plot: 3-in-1 ggplot PR curves for Lasso models ----------------------------------

pdf("figures/appendix/lasso_pr.pdf")
ggplot(pull_PR_lasso) + theme_minimal() + 
  coord_equal() + 
  geom_line(aes(x = recall, y = precision, colour = paste0("Pull AUC: ", lasso_pr_pull)), pull_PR_lasso) +
  geom_line(aes(x = recall, y = precision, colour = paste0("Push AUC: ", lasso_pr_push)), push_PR_lasso) +
  geom_line(aes(x = recall, y = precision, colour = paste0("Full AUC: ", lasso_pr_full)), full_PR_lasso) +
  scale_colour_discrete(name="Model") +
  ggtitle("PR, Lasso models") 
dev.off()

# Plot: 3-in-1 ROC curves for Lasso models ----------------------------------
pdf("figures/appendix/lasso_roc.pdf")
ggroc(roc.data.pull_12m_lasso) + theme_minimal() + 
  aes(colour = paste0("Pull AUC: ", lasso_roc_pull)) +
  geom_abline(slope=1, intercept = 1, linetype = "dashed", alpha=0.7, 
              color = "grey") + 
  coord_equal() + 
  geom_line(aes(x = specificities, y = sensitivities, colour = paste0("Push AUC: ", lasso_roc_push)), push_PR_lasso) +
  geom_line(aes(x = specificities, y = sensitivities, colour = paste0("Full AUC: ", lasso_roc_full)), full_PR_lasso) +
  scale_colour_discrete(name="Model") +
  ggtitle("ROC, Lasso models") 
dev.off()





